library(tidyverse)
mpg
## # A tibble: 234 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
변수들
- manufacturer와 model: 자동차 생산회사와 모델. 1999년에서 2008년 사이의 38가지 모델.
- displ: 엔진 배기량(liter).
- year: 생산연도
- cyl: 자동차의 실린더수(엔진의 배기량을 나타냄)
- trans: 자동차의 변속방법
- drv: 자동차의 구동렬. 전륜구동(front wheel, f), 후륜구동(rear wheel, r), 사륜구동(four wheel, 4)
- cty와 hwy: 도시와 고속도로에서의 mpg(갤런당 주행거리 mile/gallon).
- fl: 연료 타입.
- class: 자동차 타입, 2인승, SUV, 소형차 등.
data
aesthetic mapping
적어도 하나의 layer - geom 함수
ggplot(mpg, aes(x = displ, y = hwy)) + geom_point( )
이 코드는 산점도를 생성하는 것으로 세가지 요소는 다음과 같이 정의되어 있다.
- data: mpg.
- aesthetic mapping: x축 변수에 displ, y축 변수에 hwy를 대응 (각 줄의 데이터를 점 또는 색깔 등 각각의 스케일에 따라서 매핑시켜 주는 것)
- layer: geom_point - 점찍기
aes 함수에서 colour, size, shape 지정하기
- aes(displ, hwy, colour = class)
- aes(displ, hwy, shape = drv)
- aes(displ, hwy, size = cyl)
ggplot(mpg, aes(displ, hwy, colour = class)) +
geom_point()
# x축 displ y축 hwy, 색구분은 class로해서 점찍어봐
# Scatter plot of city mileage against displacement: point shape encodes the
# vehicle class and point colour encodes the drive train (`col` is accepted
# as a short form of `colour`).
# `class` has 7 levels but the default shape palette only offers 6, so the
# rows of one class (62 rows) were dropped with a warning. Supplying one
# shape code per class keeps every observation on the plot.
ggplot(mpg, aes(displ, cty, shape = class, col = drv)) +
  geom_point() +
  scale_shape_manual(values = 0:6)
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 7. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
# colour -> col로 줄여서
ggplot(mpg, aes(displ, cty, size = cyl)) +
geom_point()
# 실린더 숫자를 점의 크기 size에 매칭
*facet_wrap() 함수를 + 형태로 연결.
# One scatter-plot panel per vehicle class, laid out in three columns
# (comparable to drawing a grid of subplots).
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(facets = ~class, ncol = 3)
waiter가 자신이 일하는 restaurant에서 수개월 동안 일을 하면서 tip과 관련하여 기록한 모든 자료
변수들
- total_bill: 각 table에서 지불한 돈(dollar)
- tip: 각 table에서 지불한 팁 (dollar)
- sex: 돈을 낸 사람의 성별
- smoker: table이 smoking section에 있는가
- day: 서빙을 한 요일
- time: 서빙한 시간(Dinner, Lunch)
- size: 함께와서 식사한 사람 수 (명)library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
data(tips)
ggplot(tips, aes(tip)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(tips, aes(tip)) + geom_histogram(binwidth=1)
ggplot(tips, aes(tip)) + geom_histogram(binwidth=0.5)
ggplot(tips, aes(tip)) + geom_histogram(binwidth=0.25)
ggplot(tips, aes(tip)) + geom_histogram(binwidth=0.1)
ggplot(tips, aes(tip)) + geom_histogram(binwidth=0.05)
ggplot(tips, aes(tip)) + geom_histogram(binwidth=0.01)
# 연속변수중 하나인 tip을 aes로 히스토그램 그려라
ggplot(tips, aes(tip)) + geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(tips, aes(tip)) +
geom_freqpoly(binwidth = 0.25)
# 도수 다각형
# Kernel density estimate of the tip amount, using the default bandwidth.
ggplot(tips, aes(tip)) + geom_density()
# Same estimate with an explicit bandwidth: `bw` is the smoothing bandwidth of
# the kernel, not an abbreviation of the histogram's `binwidth`.
ggplot(tips, aes(tip)) +
geom_density(bw = 0.1)
# The density curve is estimated under the constraint that the total area
# under the curve equals 1.
ggplot(tips,aes(time))+geom_bar()
ggplot(tips,aes(day))+geom_bar()
# 범주에 따라서
# Bar chart of the number of tables served per day, with the days put into
# calendar order (Thur -> Sun) by re-levelling the factor.
# The argument is spelled out as `levels`; the former `level =` only worked
# through partial argument matching.
level_order <- factor(tips$day, levels = c("Thur", "Fri", "Sat", "Sun"))
ggplot(tips, aes(level_order)) + geom_bar() + xlab("Day")
#ggplot(tips, aes(level_order, day)) + geom_point()+xlab("Day")+ylab("Count")
# Day labels in calendar order, used below to order the x axis.
level_order <- c("Thur", "Fri", "Sat", "Sun")
# Pre-compute the number of tables per day as a two-column data frame.
TipsDay <- as.data.frame(table(tips$day)) # convert the frequency table to a data frame
colnames(TipsDay) <- c("day", "count") # name the columns
#TipsDay<-data.frame(
# day=factor(c("Thur","Fri","Sat","Sun"),
# levels=c("Thur","Fri","Sat","Sun")),
# count=c(62,19,87,76))
# `levels` is spelled out in full (the former `level =` relied on partial
# argument matching).
ggplot(TipsDay, aes(x = factor(day, levels = level_order), count)) +
  geom_bar(stat = "identity") +
  xlab("Day")
# stat = "identity" uses the values in `count` as they are. The default stat
# of geom_bar() is "count", which would tally rows instead; the frequencies
# were already computed in the data frame, so "identity" is required here.
ggplot(TipsDay, aes(x = factor(day, levels = level_order), count)) +
  geom_point() +
  xlab("Day")
# Relative-frequency views of the per-day counts: a single stacked bar, and
# the same bar drawn in polar coordinates, which turns it into a pie chart.
ggplot(data = TipsDay, mapping = aes(x = "", y = count, fill = day)) +
  geom_bar(stat = "identity")
ggplot(data = TipsDay, mapping = aes(x = "", y = count, fill = day)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y")
ggplot(tips, aes(day, tip)) +
geom_point()
geom_jitter(): 자료에 약간의 랜덤 노이즈를 더하여 겹쳐져서 그려지지 않게 하는 방법geom_boxplot(): summary statistics를 이용하여 상자그림을 그리는 방법geom_violin()ggplot(tips, aes(day, tip)) + geom_jitter() #점을 흐트려줌
ggplot(tips, aes(day, tip)) + geom_boxplot()
ggplot(tips, aes(day, tip)) + geom_violin()
ggplot(tips, aes(total_bill, colour = day)) +
geom_freqpoly(binwidth=5)
ggplot(tips, aes(total_bill, colour = day)) +
geom_density()
ggplot(tips, aes(total_bill, colour = day,fill=day)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(tips, aes(total_bill, fill = day)) +
geom_histogram() +
facet_wrap(~day, ncol = 1) # 요일에따라 그림그리면서 한줄로 나눔
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Tip against total bill, first as a plain scatter plot ...
ggplot(data = tips, mapping = aes(x = total_bill, y = tip)) +
  geom_point()
# ... then with a smoothed trend line on top. Passing se = FALSE to
# geom_smooth() removes the standard-error band.
ggplot(data = tips, mapping = aes(x = total_bill, y = tip)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
geom_smooth()함수의 중요한 인자는 method로 smooth line을 찾는 방법을 지정하는 것이다.
method = "loess"ggplot(tips, aes(total_bill, tip)) +
geom_point() +
geom_smooth(span = 0.1) # local smoothing옵션이 span
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(tips, aes(total_bill, tip)) +
geom_point() +
geom_smooth(span = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
기본 옵션이 local smoothing이고 linear model등으로 바꿀수 있음
- `method = "lm"`
ggplot(tips, aes(total_bill, tip)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
titanic호 승객들의 운명에 관한 정보를 아래의 변수들로 요약한 자료
변수들
- Class: 1st, 2nd, 3rd, Crew
- Sex: Male, Female
- Age: Child, Adult
- Survived: No, Yesdata(Titanic)
Titanic
## , , Age = Child, Survived = No
##
## Sex
## Class Male Female
## 1st 0 0
## 2nd 0 0
## 3rd 35 17
## Crew 0 0
##
## , , Age = Adult, Survived = No
##
## Sex
## Class Male Female
## 1st 118 4
## 2nd 154 13
## 3rd 387 89
## Crew 670 3
##
## , , Age = Child, Survived = Yes
##
## Sex
## Class Male Female
## 1st 5 1
## 2nd 11 13
## 3rd 13 14
## Crew 0 0
##
## , , Age = Adult, Survived = Yes
##
## Sex
## Class Male Female
## 1st 57 140
## 2nd 14 80
## 3rd 75 76
## Crew 192 20
apply(Titanic, 1, sum)
## 1st 2nd 3rd Crew
## 325 285 706 885
apply(Titanic, 2, sum)
## Male Female
## 1731 470
apply(Titanic, 3, sum)
## Child Adult
## 109 2092
apply(Titanic, 4, sum)
## No Yes
## 1490 711
apply(Titanic, c(1,4), sum)
## Survived
## Class No Yes
## 1st 122 203
## 2nd 167 118
## 3rd 528 178
## Crew 673 212
apply(Titanic, c(2,4), sum)
## Survived
## Sex No Yes
## Male 1364 367
## Female 126 344
apply(Titanic, c(3,4), sum)
## Survived
## Age No Yes
## Child 52 57
## Adult 1438 654
library(ggmosaic) # Haley Jeppson and Heike Hofmann만듦
Titanic.data<-as.data.frame(Titanic)
head(Titanic.data)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
ggplot(Titanic.data)+geom_mosaic(aes(x=product(Class),weight=Freq,fill=Survived))
# 변수를 쓸때 product이라는 함수를 써서 줘야함
ggplot(Titanic.data)+geom_mosaic(aes(x=product(Survived),weight=Freq,fill=Class))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Sex),weight=Freq,fill=Survived))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Survived),weight=Freq,fill=Sex))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Age),weight=Freq,fill=Survived))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Survived),weight=Freq,fill=Age))
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:ggmosaic':
##
## happy
ggpairs(tips)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggpairs(mpg[,-c(1:2)])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Age),conds=product(Class),weight=Freq,fill=Survived)) #conds -> conditional x값을 동일하게 매핑 해줌
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Class),conds=product(Age),weight=Freq,fill=Survived))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Age),conds=product(Sex),weight=Freq,fill=Survived))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Sex),conds=product(Age),weight=Freq,fill=Survived))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Sex),conds=product(Class),weight=Freq,fill=Survived))
ggplot(Titanic.data)+
geom_mosaic(aes(x=product(Class),conds=product(Sex),weight=Freq,fill=Survived))
economics data (`ggplot2` library): 1967년 7월부터 2015년 4월까지의 과거 40여년간 미국의 경제 상황에 대하여 조사한 월간 자료
변수들
- date: 조사한 날짜(year-month-day)
- pce: 개인소비지출 (personal consumption expenditures, billions of dollars)
- pop: 전체 인구수 (total population, thousands)
- psavert: 개인 저축률 (personal savings rate)
- uempmed: 실업지속 기간의 중앙값 (median duration of unemployment, weeks)
- unemploy: 실업자 수 (number of unemployed, thousands)economics
## # A tibble: 574 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
## 7 1968-01-01 531. 199808 11.7 5.1 2878
## 8 1968-02-01 534. 199920 12.3 4.5 3001
## 9 1968-03-01 544. 200056 11.7 4.1 2877
## 10 1968-04-01 544 200208 12.3 4.6 2709
## # … with 564 more rows
ggplot(economics, aes(date, pce)) +
geom_line() #geom_path 비슷하지만 데이터를 정렬하지 않음
ggplot(economics, aes(date, psavert)) +
geom_line()
ggplot(economics, aes(date, unemploy)) +
geom_line()
ggplot(economics, aes(date, unemploy / pop)) +
geom_line()
ggplot(economics, aes(date, uempmed)) +
geom_line()
geom_text를 이용하여 그림 안에 글자를 넣기df <- data.frame(trt = c("a", "b", "c"), resp = c(1.2, 3.4, 2.5))
ggplot(df, aes(resp, trt)) +
geom_text(aes(label = trt))
family: 문자의 폰트를 지정. (“sans”(기본값), “serif”, “mono”)df <- data.frame(trt = c("a", "b", "c"), resp = c(1.2, 3.4, 2.5))
ggplot(df, aes(resp, trt)) +
geom_text(aes(label = trt),family="serif")
fontface:서체를 지정. (“plain”(기본값), “bold”, “italic”)df <- data.frame(trt = c("a", "b", "c"), resp = c(1.2, 3.4, 2.5))
ggplot(df, aes(resp, trt)) +
geom_text(aes(label = trt),fontface="italic")
hjust (“left”, “center”(기본값), “right”, “inward”, “outward”)
vjust (“bottom”, “middle”(기본값), “top”, “inward”, “outward”)
df <- data.frame(trt = c("a", "b", "c"), resp = c(1.2, 3.4, 2.5))
ggplot(df, aes(resp, trt)) +
geom_point()+
geom_text(aes(label = trt),hjust="right",vjust="bottom")
df <- data.frame(
x = c(1, 1, 2, 2, 1.5),
y = c(1, 2, 1, 2, 1.5),
text = c( "bottom-left","bottom-right","top-left","top-right","center"),
S=c(6,8,10,12,14),
a=c(0,30,45,60,90)
)
ggplot(df, aes(x, y)) +
geom_text(aes(label = text))
ggplot(df, aes(x, y)) +
geom_text(aes(label = text), vjust = "inward", hjust = "inward")
size: 글씨의 크기angle: 문자의 회전ggplot(df, aes(x,y)) +
geom_point() +
geom_text(aes(label = text),size=7,angle=30)
ggplot(df, aes(x,y)) +
geom_point() +
geom_text(aes(label = text,size=S,angle=a))+
xlim(0,3)+ylim(0,3)
check_overlap = TRUE를 이용하면 겹쳐지는 label을 제거해 준다. 단, 자료의 순서대로 label을 쓰면서 겹치는 부분을 제거하므로 뒤쪽 자료의 label이 제거됨.ggplot(mpg, aes(displ, hwy)) +
geom_text(aes(label = model)) +
xlim(1, 8)
#겹치는것을 막아줌
ggplot(mpg, aes(displ, hwy)) +
geom_text(aes(label = model), check_overlap = TRUE) +
xlim(1, 8)
directlabels 패키지의 geom_dl() 함수를 이용하면 범주의 이름을 적절한 위치에 써준다.
ggplot(mpg, aes(displ, hwy, colour = class)) +
geom_point()
ggplot(mpg, aes(displ, hwy, colour = class)) +
geom_point(show.legend = FALSE) +
directlabels::geom_dl(aes(label = class), method = "smart.grid")
geom_line(): 자료를 x변수의 순서대로 sorting 후 점을 이어줌.geom_path(): 자료에 나타난 순서대로 점을 이어줌.df <- data.frame(
x = c(6, 2, 12),
y = c(4, 8, 12),
label = c("a","b","c")
)
df
## x y label
## 1 6 4 a
## 2 2 8 b
## 3 12 12 c
p <- ggplot(df, aes(x, y, label = label)) +
labs(x = NULL, y = NULL) # Hide axis label
p + geom_line() + ggtitle("line")
p + geom_path() + ggtitle("path")
geom_area(): geom_line으로 그려진 선과 y=0으로 만들어지는 영역을 나타냄.geom_polygon(): 자료에 나타난 순서대로 점을 잇고 마지막 점과 첫 점을 이어 생기는 영역을 나타냄.geom_rect(): x와 y 축의 범위로 지정된 네점(xmin, xmax, ymin, ymax)를 이어주는 영역을 나타냄p + geom_rect(aes(xmin=5,xmax=6,ymin=6,ymax=10))
p + geom_area() + ggtitle("area")
p + geom_polygon() + ggtitle("polygon")
geom_vline(): 지정된 위치(xintercept)에 수직선을 그려줌
geom_hline(): 지정된 위치(yintercept)에 수평선을 그려줌
geom_abline(): 지정된 intercept와 slope를 이용하여 직선을 그려줌
p+geom_point(size=5,color="green")+
geom_vline(xintercept=5,size=2)+
geom_hline(yintercept=10,linetype=2)+
geom_abline(intercept=0,slope=1,size=1.5,linetype=2,color="red")
ggplot(economics, aes(date, unemploy)) +
geom_line()
presidential data (’ggplot2` library)1953년부터 2017년까지의 미국 대통령 임기와 집권당을 나타낸 자료
변수들
- name : 대통령 이름
- start : 임기 시작일
- end : 임기 종료일
- party : 집권 정당 presidential <- subset(presidential, start > economics$date[1])
# subset()은 base R 함수임 (dplyr의 filter()와 같은 역할)
# Unemployment over time with the presidential terms as a backdrop:
#   - one shaded rectangle per term, filled by party,
#   - a grey vertical line at the start of each term,
#   - the president's name next to that line,
#   - the unemployment series drawn on top.
ggplot(economics) +
  geom_rect(
    aes(xmin = start, xmax = end, fill = party),
    ymin = -Inf, ymax = Inf, alpha = 0.2,
    data = presidential
  ) +
  geom_vline(
    aes(xintercept = as.numeric(start)),
    data = presidential,
    colour = "grey50", alpha = 0.5
  ) +
  geom_text(
    aes(x = start, y = 2500, label = name),
    data = presidential,
    size = 3, vjust = 0, hjust = 0, nudge_x = 50
  ) +
  geom_line(aes(date, unemploy)) +
  # Named values bind each colour to its party explicitly instead of relying
  # on the alphabetical order of the `party` values.
  scale_fill_manual(values = c(Democratic = "blue", Republican = "red"))
annotate()을 이용하여 그림 안에 주석 달기yrng <- range(economics$unemploy)
xrng <- range(economics$date)
# range를 줘서 위치를 지정할 것임
# Annotation text: wrap the sentence at a width of 40 characters and join the
# resulting pieces with newlines so it is drawn as a multi-line label.
caption <- paste(
  strwrap(
    "Unemployment rates in the US have
varied a lot over the years",
    width = 40
  ),
  collapse = "\n"
)
# Same unemployment / presidential-term plot, with the wrapped caption added
# as a text annotation.
ggplot(economics) +
  geom_rect(
    aes(xmin = start, xmax = end, fill = party),
    ymin = -Inf, ymax = Inf, alpha = 0.2,
    data = presidential
  ) +
  geom_vline(
    aes(xintercept = as.numeric(start)),
    data = presidential,
    colour = "grey50", alpha = 0.5
  ) +
  geom_text(
    aes(x = start, y = 2500, label = name),
    data = presidential,
    size = 3, vjust = 0, hjust = 0, nudge_x = 50
  ) +
  geom_line(aes(date, unemploy)) +
  # Named values bind each colour to its party explicitly instead of relying
  # on the alphabetical order of the `party` values.
  scale_fill_manual(values = c(Democratic = "blue", Republican = "red")) +
  # Anchor the caption at xrng[1] (the smallest date) and yrng[2] (the largest
  # unemploy value), i.e. the top-left corner of the data range.
  annotate("text",
    x = xrng[1], y = yrng[2], label = caption,
    hjust = "left", vjust = "top", size = 4
  )
geom_abline()을 이용한 보조선 넣기ggplot(diamonds, aes(log10(carat), log10(price))) +
geom_bin2d() +
facet_wrap(~cut, nrow = 2)
# 점을 묶어서 표현
mod_coef <- coef(lm(log10(price) ~ log10(carat), data = diamonds))
#중량과 가격에 대한 회귀계수를 구함
ggplot(diamonds, aes(log10(carat), log10(price))) +
geom_bin2d() +
geom_abline(intercept = mod_coef[1], slope = mod_coef[2],
colour = "white", size = 1) +
facet_wrap(~cut, nrow = 2)
#전체가격에 회귀직선을 추가해서 그림
group 활용하기Oxboys data (’nlme` library)Oxford에 있는 26명 소년에 대한 자료로 소년이 나이가 들어감에 따라 키가 커가는지를 보기 위해 9번 측정한 자료
변수들
data(Oxboys, package = "nlme")
head(Oxboys)
## Grouped Data: height ~ age | Subject
## Subject age height Occasion
## 1 1 -1.0000 140.5 1
## 2 1 -0.7479 143.4 2
## 3 1 -0.4630 144.8 3
## 4 1 -0.1643 147.1 4
## 5 1 -0.0027 147.7 5
## 6 1 0.2466 150.2 6
tail(Oxboys)
## Grouped Data: height ~ age | Subject
## Subject age height Occasion
## 229 26 -0.1643 136.7 4
## 230 26 -0.0027 138.4 5
## 231 26 0.2466 138.9 6
## 232 26 0.5562 141.8 7
## 233 26 0.7781 142.6 8
## 234 26 1.0055 143.1 9
ggplot(Oxboys, aes(age, height)) +
geom_point() +
geom_line()
ggplot(Oxboys, aes(age, height, group = Subject)) +
geom_point() +
geom_line()
- `geom_point()`는 group에 관계 없이 해당 위치에 점을 찍어 준다.
- `geom_line()`은 group에 지정된 변수의 범주별로 선을 그려줌. group 지정이 없는 경우 자료 전체를 하나의 group으로 처리함.
ggplot 내 에서 지정한 aesthetic mapping은 그 이후의 layer에 모두 영향을 미친다. 단, layer 내에서 지정된 aesthetic mapping이 있는 경우 layer 내의 지정이 우선 적용된다.
group = Subject를 ggplot 에서 지정한 경우와 geom_line에서 지정한 경우의 차이 비교
ggplot(Oxboys, aes(age, height, group = Subject)) +
geom_line() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
ggplot(Oxboys, aes(age, height)) +
geom_line(aes(group = Subject)) +
geom_smooth(method = "lm", size = 2, se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
#group을 전체로 지정해줌
*상자그림 위에 Subject별 profile line 그리기
ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot()
ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot() +
geom_line(colour = "#3366FF", alpha = 0.5)
- `geom_boxplot()`의 경우 x 변수는 범주형 변수로 x 변수가 group 으로 쓰이게 되며 x 변수의 범주별로 상자그림을 그려준다.
- Subject 별 profile line을 상자그림 위에 그리기 위해서는 `geom_line`에서 따로 `group = Subject` 지정을 해주면 된다.
ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot() +
geom_line(aes(group = Subject))
alpha 와 color 옵션을 이용ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot() +
geom_line(aes(group = Subject),
colour = "blue", alpha=0.5)
# aes안에넣으면 각각 밖에쓰면 전체 alpha-> 투명도
ggplot(Oxboys, aes(Occasion, height)) +
geom_line()
ggplot(Oxboys, aes(age, height)) +
geom_boxplot()
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
# Three-point toy data set; `colour` is a numeric column holding 1, 3 and 5.
df <- data.frame(
  x = seq_len(3),
  y = seq_len(3),
  colour = c(1, 3, 5)
)
df
## x y colour
## 1 1 1 1
## 2 2 2 3
## 3 3 3 5
ggplot(df, aes(x, y, colour = factor(colour))) +
geom_line(aes(group = 1), size = 2) +
geom_point(size = 5)
ggplot(df, aes(x, y, colour = colour)) +
geom_line(aes(group = 1), size = 2) +
geom_point(size = 5)
ggplot(df, aes(x, y, colour = factor(colour))) +
geom_line(size = 2) + # combine all group
geom_point(size = 5)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
df1 <- data.frame(x = 1:6, y = 1:6,
colour = c(1,1,3,3,5,5))
df1
## x y colour
## 1 1 1 1
## 2 2 2 1
## 3 3 3 3
## 4 4 4 3
## 5 5 5 5
## 6 6 6 5
ggplot(df1, aes(x, y, colour = factor(colour))) +
geom_line(size = 2) + geom_point(size = 5)
ggplot(mpg, aes(class)) +
geom_bar()
ggplot(mpg, aes(class, fill = drv)) +
geom_bar()
theme을 활용하여 x축의 label을 조정하기ggplot(mpg, aes(class, fill = drv)) +
geom_bar()+theme(axis.text.x=element_text(angle=45,hjust=1))
geom_tile(): data point가 중앙에 오는 tile을 그려줌geom_raster(stat = "identity") : tile의 크기가 모두 같게 조정geom_rect(): xmin, xmax, ymin, ymax로 지정되는 tile을 그려줌df <- data.frame(
x = c(6, 2, 12),
y = c(4, 8, 12),
label = c("a","b","c")
)
df
## x y label
## 1 6 4 a
## 2 2 8 b
## 3 12 12 c
p <- ggplot(df, aes(x, y, label = label)) +
labs(x = NULL, y = NULL)
p + geom_tile() + ggtitle("tile")
p + geom_raster() + ggtitle("raster")
## Warning: Raster pixels are placed at uneven horizontal intervals and will be
## shifted. Consider using geom_tile() instead.
p + geom_rect(aes(xmin=2.5,xmax=10,ymin=4,ymax=8)) + ggtitle("rect")
head(faithfuld)
## # A tibble: 6 × 3
## eruptions waiting density
## <dbl> <dbl> <dbl>
## 1 1.6 43 0.00322
## 2 1.65 43 0.00384
## 3 1.69 43 0.00444
## 4 1.74 43 0.00498
## 5 1.79 43 0.00542
## 6 1.84 43 0.00574
# Contour lines of the `density` column over the eruptions/waiting grid.
# Each line is coloured by the contour level computed by the stat;
# `after_stat(level)` replaces the deprecated `..level..` notation.
ggplot(faithfuld, aes(eruptions, waiting)) +
  geom_contour(aes(z = density, colour = after_stat(level)))
ggplot(faithfuld, aes(eruptions, waiting)) +
geom_raster(aes(fill = density))
label <- data.frame(
waiting = c(55, 80),
eruptions = c(2, 4.3),
label = c("peak one", "peak two")
)
ggplot(faithfuld, aes(waiting, eruptions)) +
geom_raster(aes(fill = density)) +
geom_label(data = label, aes(label = label))
# Bubble plots work better with fewer observations
small <- faithfuld[seq(1, nrow(faithfuld), by = 10), ]
ggplot(small, aes(eruptions, waiting)) +
geom_point(aes(size = density), alpha = 1/3) +
scale_size_area()
diamonds data (`ggplot2` library): 53940개의 다이아몬드에 대한 자료
변수들
- carat: 다이아몬드 무게(carat, 1 carat = 0.2 g)
- cut: 절단면에 대한 품질 (Fair, Good, VeryGood, Premium, Ideal)
- color: 다이아몬드 색깔 (D:best ~ J:worst)
- clarity: 다이아몬드의 맑고 깨끗한 정도 (I1:worst, SI2, SI1, VS2, VS1, VVS2, VVS1, IF:best)
- depth: 깊이 = 2*z/(x+y)
- table: 상단면
- price: 다이아몬드 가격(dollar)
- x: length
- y: width
- z: depthdiamonds
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 53,930 more rows
ggplot(diamonds, aes(depth)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(diamonds, aes(depth)) +
geom_histogram(binwidth = 0.1) +
xlim(55, 70)
## Warning: Removed 45 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
ggplot(diamonds, aes(depth)) +
geom_histogram()+xlab(quote(paste("depth=2*",frac(z,(x+y)))))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
geom_freqpoly() 이용하여 범주별로 분포 비교하기ggplot(diamonds, aes(depth)) +
geom_freqpoly(aes(colour = cut), binwidth = 0.1, na.rm = TRUE) +
xlim(58, 68) +
theme(legend.position = "none")
geom_histogram(position = "fill") 이용하여 범주별로 분포 비교하기ggplot(diamonds, aes(depth)) +
geom_histogram(aes(fill = cut), binwidth = 0.1, position = "fill",
na.rm = TRUE) +
xlim(58, 68) +
theme(legend.position = "none")
geom_density() 이용하여 범주별로 분포 비교하기ggplot(diamonds, aes(depth, fill = cut, colour = cut)) +
geom_density(alpha = 0.2, na.rm = TRUE) +
xlim(58, 68) +
theme(legend.position = "none")
geom_boxplot()을 이용한 평행 상자그림ggplot(diamonds, aes(clarity, depth)) +
geom_boxplot()
cut_width()를 이용하여 연속변수를 범주화ggplot(diamonds, aes(carat, depth)) +
geom_boxplot(aes(group = cut_width(carat, 0.1))) +
xlim(NA, 2.05)
## Warning: Removed 997 rows containing missing values (stat_boxplot).
geom_violin()을 이용하여 범주별로 분포 비교하기ggplot(diamonds, aes(clarity, depth)) +
geom_violin()
ggplot(diamonds, aes(carat, depth)) +
geom_violin(aes(group = cut_width(carat, 0.1))) +
xlim(NA, 2.05)
## Warning: Removed 997 rows containing non-finite values (stat_ydensity).
df <- data.frame(x = rnorm(2000), y = rnorm(2000))
norm <- ggplot(df, aes(x, y)) + xlab(NULL) + ylab(NULL)
norm + geom_point()
norm + geom_point(shape = 1) # Hollow circles
norm + geom_point(shape = ".") # Pixel sized
alpha를 이용하여 점의 투명도를 조절norm + geom_point(alpha = 1 / 3)
norm + geom_point(alpha = 1 / 5)
norm + geom_point(alpha = 1 / 10)
norm + geom_bin2d()
norm + geom_bin2d(bins = 10)
norm + geom_bin2d()+scale_fill_gradient(low="black",high="white")
norm + geom_hex()
norm + geom_hex(bins = 10)
geom_bar의 기본 stat은 count임ggplot(diamonds, aes(color)) +
geom_bar()
geom_bar에서 stat="summary_bin" 이용하여 color별로 price 평균을 나타내기ggplot(diamonds, aes(color, price)) +
# `fun` is the summary-function argument the "summary_bin" stat understands.
# The former `fun.y` was ignored with a warning, so the stat silently fell
# back to `mean_se()` instead of using the function requested here.
geom_bar(stat = "summary_bin", fun = mean)
## Warning: Ignoring unknown parameters: fun.y
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
geom_bin2d의 기본 stat은 count임ggplot(diamonds, aes(table, depth)) +
geom_bin2d(binwidth = 1, na.rm = TRUE) +
xlim(50, 70) + ylim(50, 70)
geom_bin2d에서 stat="summary_2d" 이용하여 table, depth 별로 price 평균을 나타내기ggplot(diamonds, aes(table, depth, z = price)) +
geom_raster(binwidth = 1, stat = "summary_2d",
fun = mean, na.rm = TRUE) +
xlim(50, 70) + ylim(50, 70)
## Warning: Raster pixels are placed at uneven horizontal intervals and will be
## shifted. Consider using geom_tile() instead.
## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.
# City mileage by transmission type, with the mean of each transmission type
# overlaid as a large red point. `fun` replaces the deprecated `fun.y`.
ggplot(mpg, aes(trans, cty)) +
  geom_point() +
  stat_summary(geom = "point", fun = "mean", colour = "red", size = 4)
## Warning: `fun.y` is deprecated. Use `fun` instead.
stat_에서 계산된 값으로 R 내부에서 사용되는 변수
- `..count..`: 각 범주의 관측수
- `..density..`:각 범주의 관측에 대한 비율 (percentage of total / bar width)
- `..x..`: 각 범주의 중심ggplot(diamonds, aes(price)) +
geom_histogram(binwidth = 500)
# Histogram of price on the density scale rather than raw counts.
# `after_stat(density)` refers to the density computed by the binning stat
# and replaces the deprecated `..density..` notation.
ggplot(diamonds, aes(price)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 500)
ggplot(diamonds, aes(price, colour = cut)) +
geom_freqpoly(binwidth = 500) +
theme(legend.position = "none")
# Frequency polygons of price, one per cut, on the density scale so that cuts
# with different numbers of diamonds can be compared; the legend is hidden.
# `after_stat(density)` replaces the deprecated `..density..` notation.
ggplot(diamonds, aes(price, colour = cut)) +
  geom_freqpoly(aes(y = after_stat(density)), binwidth = 500) +
  theme(legend.position = "none")
xlab() 과 ylab()을 이용ggplot(mpg, aes(cty, hwy)) +
geom_point()
ggplot(mpg, aes(cty, hwy)) +
geom_point() +
xlab("city driving (mpg)") +
ylab("highway driving (mpg)")
# Remove the axis labels with NULL
ggplot(mpg, aes(cty, hwy)) +
geom_point() +
xlab(NULL) +
ylab(NULL)
xlim() 과 ylim()ggplot(mpg, aes(drv, hwy)) +
geom_jitter(width = 0.25)
ggplot(mpg, aes(drv, hwy)) +
geom_jitter(width = 0.25) +
xlim("f", "r") +
ylim(20, 30)
## Warning: Removed 138 rows containing missing values (geom_point).
# For continuous scales, use NA to set only one limit
ggplot(mpg, aes(drv, hwy)) +
geom_jitter(width = 0.25, na.rm = TRUE) +
ylim(NA, 30)
xlab(), ylab(), labs()으로 축 이름 지정 가능ggtitle()을 이용하여 그림 제목 지정df <- data.frame(x = 1:2, y = 1, z = "a")
p <- ggplot(df, aes(x, y)) + geom_point(aes(colour = z))
p +
xlab("X axis") +
ylab("Y axis") + ggtitle("TEST Plot")
p + labs(x = "X axis", y = "Y axis", colour = "Colour\nlegend")
p <- ggplot(df, aes(x, y)) +
geom_point() +
theme(plot.background = element_rect(colour = "grey50"))
p + labs(x = "", y = "")
p + labs(x = NULL, y = NULL)
df <- data.frame(x = c(1, 3, 5) * 1000, y = 1)
axs <- ggplot(df, aes(x, y)) +
geom_point() +
labs(x = NULL, y = NULL)
axs
axs + scale_x_continuous(breaks = c(2000, 4000))
axs + scale_x_continuous(breaks = c(2000, 4000), labels = c("2k", "4k"))
df2 <- data.frame(x = 1:3, y = c("a", "b", "c"))
df2
## x y
## 1 1 a
## 2 2 b
## 3 3 c
ggplot(df2, aes(x, y)) +
geom_point()
ggplot(df2, aes(x, y)) +
geom_point() +
scale_y_discrete(labels = c(a = "apple", b = "banana", c = "carrot"))
df <- data.frame(x = 1, y = 1:3, z = letters[1:3])
p <- ggplot(df, aes(x, y, colour = z))
p + geom_point()
p + geom_point() + geom_path(aes(group = 1))
p + geom_raster(aes(fill = z))
ggplot(df, aes(y, y)) +
geom_point(size = 4, colour = "grey20", show.legend = TRUE) +
geom_point(aes(colour = z), size = 2)
p <- ggplot(mpg, aes(displ, hwy, colour = factor(cyl))) +
geom_point()
print()를 이용하여 스크린상에 나타냄.print(p)
summary()summary(p)
## data: manufacturer, model, displ, year, cyl, trans, drv, cty, hwy, fl,
## class [234x11]
## mapping: x = ~displ, y = ~hwy, colour = ~factor(cyl)
## faceting: <ggproto object: Class FacetNull, Facet, gg>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map_data: function
## params: list
## setup_data: function
## setup_params: function
## shrink: TRUE
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet, gg>
## -----------------------------------
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity